{
   "cells": [
      {
         "cell_type": "markdown",
         "id": "7a9f093e-e027-405b-ae3d-17dda9e30cd0",
         "metadata": {},
         "source": [
            "# NYC Wikipedia Embeddings Demo"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "cadae9f2",
         "metadata": {},
         "outputs": [],
         "source": [
            "import logging\n",
            "import sys\n",
            "\n",
            "# basicConfig installs the single stdout handler itself. force=True replaces any\n",
            "# handlers already attached to the root logger, so re-running this cell (or a\n",
            "# kernel that pre-configured logging) cannot produce duplicated log lines.\n",
            "logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)"
         ]
      },
      {
         "cell_type": "markdown",
         "id": "3e594a62-110e-40b3-ad1e-c99f49a4e537",
         "metadata": {},
         "source": [
            "Demonstrates embedding-based querying with GPTTreeIndex and GPTListIndex, using the \"New York City\" Wikipedia page as the source document."
         ]
      },
      {
         "cell_type": "markdown",
         "id": "b145f093-afb0-46b8-a81f-466af8478439",
         "metadata": {},
         "source": [
            "### Setup + Data Prep"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "d038dcc1",
         "metadata": {},
         "outputs": [],
         "source": [
            "import logging\n",
            "import sys\n",
            "\n",
            "# basicConfig is a no-op once the root logger already has handlers, so without\n",
            "# force=True the DEBUG level requested here would be silently ignored, and an\n",
            "# extra addHandler call would only duplicate every log line.\n",
            "logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "b4b4387b-413e-4016-ba1e-88b3d9410a38",
         "metadata": {},
         "outputs": [],
         "source": [
            "# fetch \"New York City\" page from Wikipedia\n",
            "from pathlib import Path\n",
            "\n",
            "import requests\n",
            "\n",
            "http_response = requests.get(\n",
            "    'https://en.wikipedia.org/w/api.php',\n",
            "    params={\n",
            "        'action': 'query',\n",
            "        'format': 'json',\n",
            "        'titles': 'New York City',\n",
            "        'prop': 'extracts',\n",
            "        # 'exintro': True,\n",
            "        'explaintext': True,\n",
            "    },\n",
            "    timeout=30,  # don't hang the notebook indefinitely if the API is unreachable\n",
            ")\n",
            "# fail here on an HTTP error instead of on a confusing parsing error below\n",
            "http_response.raise_for_status()\n",
            "response = http_response.json()\n",
            "page = next(iter(response['query']['pages'].values()))\n",
            "nyc_text = page['extract']\n",
            "\n",
            "# exist_ok=True makes the cell safe to re-run without a separate exists() check\n",
            "data_path = Path('data')\n",
            "data_path.mkdir(exist_ok=True)\n",
            "\n",
            "# the article contains non-ASCII characters, so pin the encoding instead of\n",
            "# relying on the platform default\n",
            "with open(data_path / 'nyc_text.txt', 'w', encoding='utf-8') as fp:\n",
            "    fp.write(nyc_text)"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "f1a9eb90-335c-4214-8bb6-fd1edbe3ccbd",
         "metadata": {},
         "outputs": [],
         "source": [
            "# My OpenAI Key\n",
            "import os\n",
            "from getpass import getpass\n",
            "\n",
            "# Keep a key that is already set in the environment; otherwise prompt for it so\n",
            "# the secret is never typed into (and saved with) the notebook file.\n",
            "if not os.environ.get('OPENAI_API_KEY'):\n",
            "    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')"
         ]
      },
      {
         "cell_type": "markdown",
         "id": "def4eca7-ba03-48e2-b18f-fd669b91a5fc",
         "metadata": {},
         "source": [
            "### GPTTreeIndex - Embedding-based Query"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "8d0b2364-4806-4656-81e7-3f6e4b910b5b",
         "metadata": {},
         "outputs": [
            {
               "name": "stderr",
               "output_type": "stream",
               "text": [
                  "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
               ]
            }
         ],
         "source": [
            "from llama_index import GPTTreeIndex, SimpleDirectoryReader\n",
            "from IPython.display import Markdown"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "1298bbb4-c99e-431e-93ef-eb32c0a2fc2a",
         "metadata": {
            "tags": []
         },
         "outputs": [],
         "source": [
            "# load documents from the data/ directory (nyc_text.txt was written there above),\n",
            "# then build a GPTTreeIndex from them\n",
            "documents = SimpleDirectoryReader('data').load_data()\n",
            "index = GPTTreeIndex.from_documents(documents)"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "0b4fe9b6-5762-4e86-b51e-aac45d3ecdb1",
         "metadata": {},
         "outputs": [],
         "source": [
            "index.save_to_disk('index.json')"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "5eec265d-211b-4f26-b05b-5b4e7072bc6e",
         "metadata": {},
         "outputs": [],
         "source": [
            "new_index = GPTTreeIndex.load_from_disk('index.json')"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "68c9ebfe-b1b6-4f4e-9278-174346de8c90",
         "metadata": {
            "tags": []
         },
         "outputs": [],
         "source": [
            "# set Logging to DEBUG for more detailed outputs\n",
            "response = new_index.query(\"What is the name of the professional women's basketball team in New York City?\", mode=\"embedding\")"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "e1000018-18de-410d-b6d9-c66bf37ccf1d",
         "metadata": {},
         "outputs": [],
         "source": [
            "display(Markdown(f\"<b>{response}</b>\"))"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "4fc3f18a-0ef9-453c-acf8-7aedd784cdcf",
         "metadata": {},
         "outputs": [],
         "source": [
            "response = new_index.query(\n",
            "    \"What battles took place in New York City in the American Revolution?\", \n",
            "    mode=\"embedding\"\n",
            ")"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "5588289b-9fdc-4b86-bab9-808c97be05e1",
         "metadata": {},
         "outputs": [],
         "source": [
            "display(Markdown(f\"<b>{response}</b>\"))"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "53265fd4-da98-4cf9-abfb-3f76105fd2ff",
         "metadata": {},
         "outputs": [],
         "source": [
            "# set Logging to DEBUG for more detailed outputs\n",
            "response = new_index.query(\"What are the airports in New York City?\", mode=\"embedding\")"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "bc08060f-b031-4dc5-a980-427dd2407b5d",
         "metadata": {},
         "outputs": [],
         "source": [
            "display(Markdown(f\"<b>{response}</b>\"))"
         ]
      },
      {
         "cell_type": "markdown",
         "id": "63009734-deda-4159-9f2b-0af19720e913",
         "metadata": {},
         "source": [
            "### GPTListIndex - Embedding-based Query"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "fd8920ae-8115-457c-b092-21e50cc3bcc0",
         "metadata": {},
         "outputs": [],
         "source": [
            "from llama_index import GPTListIndex, SimpleDirectoryReader\n",
            "from IPython.display import Markdown"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "27c8bbee-daf5-494d-ba66-b60142592a96",
         "metadata": {},
         "outputs": [],
         "source": [
            "# load documents from the data/ directory again and build a GPTListIndex;\n",
            "# note this rebinds `index`, which previously held the tree index\n",
            "documents = SimpleDirectoryReader('data').load_data()\n",
            "index = GPTListIndex.from_documents(documents)"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "c3d5a589-ee75-40bd-9529-75f693874ed7",
         "metadata": {},
         "outputs": [],
         "source": [
            "index.save_to_disk('index_list_emb.json')"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "9dfbef52-50fb-46ca-b82b-c44cfa2301ef",
         "metadata": {},
         "outputs": [],
         "source": [
            "# try loading\n",
            "new_index = GPTListIndex.load_from_disk('index_list_emb.json')"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "2cbf24c2-060e-4216-9188-a6746af1830d",
         "metadata": {
            "tags": []
         },
         "outputs": [],
         "source": [
            "# set Logging to DEBUG for more detailed outputs\n",
            "response = new_index.query(\"What is the name of the professional women's basketball team in New York City?\", mode=\"embedding\")"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "14e1b19f-fbf7-49fd-a96f-cbb37bafd498",
         "metadata": {},
         "outputs": [],
         "source": [
            "display(Markdown(f\"<b>{response}</b>\"))"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "48b86c8d-9149-4395-9d52-6070597c814d",
         "metadata": {},
         "outputs": [],
         "source": [
            "# set Logging to DEBUG for more detailed outputs\n",
            "response = new_index.query(\"What battles took place in New York City in the American Revolution?\", mode=\"embedding\")"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "57fbd90c-a8d3-4738-8531-e8f48a953167",
         "metadata": {},
         "outputs": [],
         "source": [
            "display(Markdown(f\"<b>{response}</b>\"))"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "7ab01446-9b07-4222-a577-eeb4617ce4fc",
         "metadata": {},
         "outputs": [],
         "source": [
            "# set Logging to DEBUG for more detailed outputs\n",
            "response = new_index.query(\"What are the airports in New York City?\", mode=\"embedding\")"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "091afaea-a61e-4a7c-b2f1-7df387380b8b",
         "metadata": {},
         "outputs": [],
         "source": [
            "display(Markdown(f\"<b>{response}</b>\"))"
         ]
      },
      {
         "cell_type": "markdown",
         "id": "aca03087-d6cc-4d87-8ec6-185fa03d9fea",
         "metadata": {},
         "source": [
            "## Try out other embeddings! \n",
            "(courtesy of langchain)"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "27c24411-7049-45c7-862c-0857c03db580",
         "metadata": {},
         "outputs": [],
         "source": [
            "from llama_index import GPTListIndex, SimpleDirectoryReader, ServiceContext\n",
            "from IPython.display import Markdown"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "b9ff1944-a06a-4b05-adae-a2ef25e74e8b",
         "metadata": {},
         "outputs": [],
         "source": [
            "# load in HF embedding model from langchain\n",
            "from langchain.embeddings.huggingface import HuggingFaceEmbeddings\n",
            "from llama_index import LangchainEmbedding\n",
            "embed_model = LangchainEmbedding(HuggingFaceEmbeddings())"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "3049d517-05db-459b-9e32-711e380fda67",
         "metadata": {},
         "outputs": [],
         "source": [
            "# try loading index\n",
            "new_index = GPTListIndex.load_from_disk('index_list_emb.json')"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "1494cabb-0123-408a-9d81-8e02db9b3acd",
         "metadata": {},
         "outputs": [],
         "source": [
            "# configure: wrap the langchain-backed embed_model in a ServiceContext\n",
            "service_context = ServiceContext.from_defaults(embed_model=embed_model)\n",
            "\n",
            "# set Logging to DEBUG for more detailed outputs\n",
            "# the service_context (carrying embed_model) is passed along with the query\n",
            "response = new_index.query(\n",
            "    \"What is the name of the professional women's basketball team in New York City?\", \n",
            "    mode=\"embedding\", \n",
            "    service_context=service_context, \n",
            ")"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "4d96a2e7-4eb1-474e-b855-eca3efed1bad",
         "metadata": {},
         "outputs": [],
         "source": [
            "response"
         ]
      },
      {
         "cell_type": "code",
         "execution_count": null,
         "id": "80510d3a-8bf8-47f2-b1d4-3d1bd0d5a1bb",
         "metadata": {},
         "outputs": [],
         "source": []
      }
   ],
   "metadata": {
      "kernelspec": {
         "display_name": "Python 3 (ipykernel)",
         "language": "python",
         "name": "python3"
      },
      "language_info": {
         "codemirror_mode": {
            "name": "ipython",
            "version": 3
         },
         "file_extension": ".py",
         "mimetype": "text/x-python",
         "name": "python",
         "nbconvert_exporter": "python",
         "pygments_lexer": "ipython3",
         "version": "3.10.9"
      }
   },
   "nbformat": 4,
   "nbformat_minor": 5
}